This notebook walks step by step through how to save an RDD (DataFrame) schema to a file and load it back in

Doing this will significantly decrease the amount of time necessary to load a JSON file, because Spark no longer has to scan the data to infer the schema



In [7]:

    
from pyspark.sql.types import StructType
import json



In [1]:

    
# Input JSON file holding the wiki edit records.
data_path = "wiki_edit_data.json"
# Read in the data without supplying a schema, so Spark has to work the schema
# out from the data itself.
# NOTE(review): sqlCtx is not defined anywhere in this notebook -- presumably the
# SQLContext provided by the PySpark shell/kernel; confirm.
wiki_edits = sqlCtx.read.json(data_path)



In [14]:

    
# Print the schema that was inferred for the loaded data, as a tree.
wiki_edits.printSchema()









    



root
 |-- article_id: long (nullable = true)
 |-- article_namespace: long (nullable = true)
 |-- article_title: string (nullable = true)
 |-- comment: string (nullable = true)
 |-- minor: boolean (nullable = true)
 |-- parent_id: long (nullable = true)
 |-- redirect_target: string (nullable = true)
 |-- revision_id: long (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- user_id: long (nullable = true)
 |-- user_name: string (nullable = true)



In [2]:

    
# Original (inferred) schema, shown as the StructType object we want to persist.
wiki_edits.schema









    Out[2]:





StructType(List(StructField(article_id,LongType,true),StructField(article_namespace,LongType,true),StructField(article_title,StringType,true),StructField(comment,StringType,true),StructField(minor,BooleanType,true),StructField(parent_id,LongType,true),StructField(redirect_target,StringType,true),StructField(revision_id,LongType,true),StructField(timestamp,StringType,true),StructField(user_id,LongType,true),StructField(user_name,StringType,true)))



In [3]:

    
# Serialize the schema to a JSON-formatted string and display it.
s = wiki_edits.schema.json()
s









    Out[3]:





'{"fields":[{"metadata":{},"name":"article_id","nullable":true,"type":"long"},{"metadata":{},"name":"article_namespace","nullable":true,"type":"long"},{"metadata":{},"name":"article_title","nullable":true,"type":"string"},{"metadata":{},"name":"comment","nullable":true,"type":"string"},{"metadata":{},"name":"minor","nullable":true,"type":"boolean"},{"metadata":{},"name":"parent_id","nullable":true,"type":"long"},{"metadata":{},"name":"redirect_target","nullable":true,"type":"string"},{"metadata":{},"name":"revision_id","nullable":true,"type":"long"},{"metadata":{},"name":"timestamp","nullable":true,"type":"string"},{"metadata":{},"name":"user_id","nullable":true,"type":"long"},{"metadata":{},"name":"user_name","nullable":true,"type":"string"}],"type":"struct"}'



In [4]:

    
# Confirm that schema.json() hands back a plain string rather than a parsed object.
type(wiki_edits.schema.json())









    Out[4]:





str



In [5]:

    
#the schema is now a JSON-formatted string; to write it out as a JSON object (rather than one big quoted string) we first parse it
#for this we use json.loads()



In [8]:

    
# Persist the schema: parse the JSON string into plain Python objects, then
# serialize those objects into wiki_schema.json.
with open('wiki_schema.json', 'w') as schema_file:
    schema_dict = json.loads(s)
    json.dump(schema_dict, schema_file)



In [15]:

    
#file is saved!  Let's make sure it worked by loading it back in :)



In [9]:

    
# Read the saved schema file back in and parse its JSON text into Python objects.
with open('wiki_schema.json', 'r') as schema_file:
    json_in = json.loads(schema_file.read())



In [10]:

    
# Inspect what was loaded: at this point it is plain dicts and lists, not a StructType.
json_in









    Out[10]:





{u'fields': [{u'metadata': {},
   u'name': u'article_id',
   u'nullable': True,
   u'type': u'long'},
  {u'metadata': {},
   u'name': u'article_namespace',
   u'nullable': True,
   u'type': u'long'},
  {u'metadata': {},
   u'name': u'article_title',
   u'nullable': True,
   u'type': u'string'},
  {u'metadata': {},
   u'name': u'comment',
   u'nullable': True,
   u'type': u'string'},
  {u'metadata': {}, u'name': u'minor', u'nullable': True, u'type': u'boolean'},
  {u'metadata': {},
   u'name': u'parent_id',
   u'nullable': True,
   u'type': u'long'},
  {u'metadata': {},
   u'name': u'redirect_target',
   u'nullable': True,
   u'type': u'string'},
  {u'metadata': {},
   u'name': u'revision_id',
   u'nullable': True,
   u'type': u'long'},
  {u'metadata': {},
   u'name': u'timestamp',
   u'nullable': True,
   u'type': u'string'},
  {u'metadata': {}, u'name': u'user_id', u'nullable': True, u'type': u'long'},
  {u'metadata': {},
   u'name': u'user_name',
   u'nullable': True,
   u'type': u'string'}],
 u'type': u'struct'}



In [11]:

    
# To use the saved schema when reading data we need a StructType again, so
# rebuild one from the parsed JSON and check the resulting type.
schema_in = StructType.fromJson(json_in)
type(schema_in)









    Out[11]:





pyspark.sql.types.StructType



In [12]:

    
# Read in the data again, this time with the schema!
# Pass the StructType that was already rebuilt from the saved file (schema_in)
# instead of converting json_in a second time, so Spark is handed the schema
# up front rather than having to infer it from the data.
wiki_edits2 = sqlCtx.read.json(data_path, schema=schema_in)



In [13]:

    
# Now loading is a whole lot faster! Print the schema of the reloaded data to
# make sure nothing was lost on the way through the saved file.
wiki_edits2.printSchema()









    



root
 |-- article_id: long (nullable = true)
 |-- article_namespace: long (nullable = true)
 |-- article_title: string (nullable = true)
 |-- comment: string (nullable = true)
 |-- minor: boolean (nullable = true)
 |-- parent_id: long (nullable = true)
 |-- redirect_target: string (nullable = true)
 |-- revision_id: long (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- user_id: long (nullable = true)
 |-- user_name: string (nullable = true)



In [ ]: